In [ ]:
# Load the data

In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the training and evaluation tables from the local data/ directory.
train = pd.read_csv("data/dataIMG_train.csv")
test = pd.read_csv("data/dataIMG_evaluation.csv")

# Show both column indexes on one line. A single-argument print(...) call is
# valid on Python 2 and Python 3 alike; the bare `print a, b` statement form
# is a SyntaxError on Python 3.
print("{} {}".format(train.columns, test.columns))


Index([u'Country', u'Unique.Line.ID', u'Comment', u'Ease.of.Use_POS_IMG',
       u'Reliability_POS_IMG', u'Innovation_POS_IMG', u'Connectivity_POS_IMG',
       u'Image.Quality_POS_IMG', u'Workflow.Experience_POS_IMG',
       u'Dose..CT..DXR.iXR.ONLY._POS_IMG', u'Uptime.MRI.Only._POS_IMG',
       u'Brand.Reputation_POS_IMG', u'Sales.Rep_POS_IMG', u'Trust_POS_IMG',
       u'Company.Experience_POS_IMG', u'Response.time_POS_IMG',
       u'Service.Parts_POS_IMG', u'Service.Rep_POS_IMG',
       u'Training.Application.Support_POS_IMG',
       u'Overall.Repair.Quality_POS_IMG', u'Overall.Communication_POS_IMG',
       u'Equipment.Price_POS_IMG', u'Service.Price_POS_IMG',
       u'On.time.Delivery_POS_IMG', u'Installation.Phase_POS_IMG',
       u'Other_POS_IMG', u'Ease.of.Use_NEG_IMG', u'Reliability_NEG_IMG',
       u'Innovation_NEG_IMG', u'Connectivity_NEG_IMG',
       u'Image.Quality_NEG_IMG', u'Workflow.Experience_NEG_IMG',
       u'Dose..CT..DXR.iXR.ONLY._NEG_IMG', u'Uptime.MRI.Only._NEG_IMG',
       u'Brand.Reputation_NEG_IMG', u'Sales.Rep_NEG_IMG', u'Trust_NEG_IMG',
       u'Company.Experience_NEG_IMG', u'Response.time_NEG_IMG',
       u'Service.Parts_NEG_IMG', u'Service.Rep_NEG_IMG',
       u'Training.Application.Support_NEG_IMG',
       u'Overall.Repair.Quality_NEG_IMG', u'Overall.Communication_NEG_IMG',
       u'Equipment.Price_NEG_IMG', u'Service.Price_NEG_IMG',
       u'On.time.Delivery_NEG_IMG', u'Installation.Phase_NEG_IMG',
       u'Other_NEG_IMG', u'Not.Useful'],
      dtype='object') Index([u'Country', u'Unique.Line.ID', u'Comment'], dtype='object')

Correlation matrix of the target columns


In [19]:
# Target columns are the ones present in train but absent from test.
is_target = ~train.columns.isin(test.columns)
target_cols = train.loc[:, is_target]

# Spearman rank correlation between every pair of target columns.
corr = target_cols.corr(method="spearman")

# Label both heatmap axes with the column names.
tick_labels = corr.columns.values
sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels)
plt.show()



In [20]:
# Number of training rows per country. The parenthesised single-argument
# print works on both Python 2 and Python 3.
print(train.Country.value_counts())


USA        3426
China      2465
Germany    2014
India      1942
Japan      1794
UK         1082
France     1030
Brazil      865
Name: Country, dtype: int64

In [21]:
# Column-wise sum of each target column. The parenthesised single-argument
# print works on both Python 2 and Python 3.
print(target_cols.sum())


Ease.of.Use_POS_IMG                     3918
Reliability_POS_IMG                     4215
Innovation_POS_IMG                      1574
Connectivity_POS_IMG                     403
Image.Quality_POS_IMG                   5561
Workflow.Experience_POS_IMG             3837
Dose..CT..DXR.iXR.ONLY._POS_IMG          461
Uptime.MRI.Only._POS_IMG                  28
Brand.Reputation_POS_IMG                 856
Sales.Rep_POS_IMG                        161
Trust_POS_IMG                            217
Company.Experience_POS_IMG               334
Response.time_POS_IMG                    626
Service.Parts_POS_IMG                     44
Service.Rep_POS_IMG                      638
Training.Application.Support_POS_IMG     740
Overall.Repair.Quality_POS_IMG          1809
Overall.Communication_POS_IMG            212
Equipment.Price_POS_IMG                 1268
Service.Price_POS_IMG                    332
On.time.Delivery_POS_IMG                  20
Installation.Phase_POS_IMG               124
Other_POS_IMG                            212
Ease.of.Use_NEG_IMG                      924
Reliability_NEG_IMG                     1097
Innovation_NEG_IMG                       549
Connectivity_NEG_IMG                     213
Image.Quality_NEG_IMG                    932
Workflow.Experience_NEG_IMG             1316
Dose..CT..DXR.iXR.ONLY._NEG_IMG           96
Uptime.MRI.Only._NEG_IMG                  41
Brand.Reputation_NEG_IMG                  84
Sales.Rep_NEG_IMG                         74
Trust_NEG_IMG                             36
Company.Experience_NEG_IMG               120
Response.time_NEG_IMG                    292
Service.Parts_NEG_IMG                     65
Service.Rep_NEG_IMG                      177
Training.Application.Support_NEG_IMG     232
Overall.Repair.Quality_NEG_IMG           610
Overall.Communication_NEG_IMG             95
Equipment.Price_NEG_IMG                 1631
Service.Price_NEG_IMG                    416
On.time.Delivery_NEG_IMG                  31
Installation.Phase_NEG_IMG                54
Other_NEG_IMG                             73
Not.Useful                               606
dtype: int64

Can the target columns be decomposed, embedded, etc.?


In [ ]:
from sklearn import